{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#coding=utf-8\n",
    "import codecs\n",
    "import re\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import rdflib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "赵云的字是什么？\n",
      "赵云\n",
      "曹丕的父亲是谁？\n",
      "曹丕\n",
      "父亲\n",
      "谁\n"
     ]
    }
   ],
   "source": [
    "# 基于模板匹配的问答系统\n",
    "quest1 = \"赵云的字是什么？\"\n",
    "# template1 = re.compile(r\"赵云\")\n",
    "template1 = re.compile(r\"(\\S[^的]*)的字是什么？\")\n",
    "matches = re.search(template1,quest1)\n",
    "if matches: \n",
    "    print(matches.group(0))                          # full match\n",
    "    print(matches.group(1))                          # match group1\n",
    "\n",
    "    \n",
    "# 更普适的一个模板\n",
    "quest2 = \"曹丕的父亲是谁？\"\n",
    "# template1 = re.compile(r\"赵云\")\n",
    "template2 = re.compile(r\"(\\S[^的]*)的(\\S[^是]*)是(\\S[^?]*)？\")\n",
    "matches2 = re.search(template2,quest2)\n",
    "if matches: \n",
    "    print(matches2.group(0))                          # full match\n",
    "    print(matches2.group(1))                          # match group1\n",
    "    print(matches2.group(2))                          # match group2\n",
    "    print(matches2.group(3))                          # match group3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('谁', '父亲', '曹操')\n"
     ]
    }
   ],
   "source": [
    "quest3 = \"谁的父亲是曹操？\"\n",
    "print(re.search(template2,quest3).groups())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('谁', '父亲', '曹操')\n"
     ]
    }
   ],
   "source": [
    "quest4 = \"曹丕的什么是曹操？\"\n",
    "print(re.search(template2,quest3).groups())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "谁的父亲是曹操\n"
     ]
    }
   ],
   "source": [
    "print(re.sub(template2,r\"\\1的父亲是\\3\",quest3))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "这里提取出来的三个对象，实际上就是三元组的组成部分，我们因此可以把这个问句改写成一个查询，从RDF中找到我们需要的信息。\n",
    "\n",
    "以下RDF知识图谱是根据三国在线：http://www.e3ol.com/ 上的资料整理得到。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Graph identifier=N4627b3985f854faf8219d89212722be2 (<class 'rdflib.graph.Graph'>)>"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "g = rdflib.Graph()\n",
    "g.parse(\"./data/MISC.ttl\",format=\"turtle\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "query_words = [\"谁\",\"什么\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def to_query(template,question):\n",
    "    global query_words\n",
    "    matches = re.search(template,question).groups()\n",
    "    is_query_entities = [int(x in query_words) for x in matches]\n",
    "    N_query_entities = sum(is_query_entities)\n",
    "    quest_placeholders = [\"\",\"\",\"\",\"\",\"\",\"\"]\n",
    "    if N_query_entities == 0: \n",
    "        return \"\"\n",
    "    cnt = 0\n",
    "    for i,entity in enumerate(matches):\n",
    "        if is_query_entities[i]:\n",
    "            quest_placeholders[i] = \"?x\"+str(i)\n",
    "            quest_placeholders[i+3] = \"?x\"+str(i)\n",
    "            cnt += 1\n",
    "        else:\n",
    "            quest_placeholders[i] = \"\"\n",
    "            quest_placeholders[i+3] = \"ns1:\" + entity\n",
    "    query0 = \"\"\"\n",
    "    PREFIX ns1: <http://www.e3ol.com/biography/html/> \n",
    "    select %s %s %s\n",
    "    where {\n",
    "    %s %s %s.\n",
    "    }\n",
    "    \"\"\" % (quest_placeholders[0],quest_placeholders[1],quest_placeholders[2],quest_placeholders[3],quest_placeholders[4],quest_placeholders[5])\n",
    "    return query0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def answer(question):\n",
    "    template = re.compile(r\"(\\S[^的]*)的(\\S[^是]*)是(\\S[^?]*)？\")\n",
    "    global query_words\n",
    "    matches = re.search(template,question).groups()\n",
    "    is_query_entities = [int(x in query_words) for x in matches]\n",
    "    query0 = to_query(template,question)\n",
    "    if query0 == \"\": \n",
    "        print(\"没有问题或模板不匹配\")\n",
    "        return\n",
    "    results = g.query(query0)\n",
    "    ns1 = \"http://www.e3ol.com/biography/html/\"\n",
    "    shorten = lambda x: str(x)[len(ns1):]\n",
    "    if len(results) > 0:\n",
    "        for record in results:\n",
    "            cnt = 0\n",
    "            answer_placeholders = [\"\",\"\",\"\"]\n",
    "            for i,entity in enumerate(matches):\n",
    "                if is_query_entities[i]:\n",
    "                    answer_placeholders[i] = shorten(record[cnt])\n",
    "                    cnt += 1\n",
    "                else:\n",
    "                    answer_placeholders[i] = '\\\\'+str(i+1)\n",
    "            answer_temp = r\"{}的{}是{}\".format(*answer_placeholders)\n",
    "            print(re.sub(template,answer_temp,question))\n",
    "    else:\n",
    "        print(\"没有找到结果或模板不匹配\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "    PREFIX ns1: <http://www.e3ol.com/biography/html/> \n",
      "    select ?x0  \n",
      "    where {\n",
      "    ?x0 ns1:父亲 ns1:曹操.\n",
      "    }\n",
      "    \n"
     ]
    }
   ],
   "source": [
    "print(to_query(r\"(\\S[^的]*)的(\\S[^是]*)是(\\S[^?]*)？\",\"谁的父亲是曹操？\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "曹彰的父亲是曹操\n",
      "曹丕的父亲是曹操\n",
      "曹植的父亲是曹操\n",
      "曹昂的父亲是曹操\n"
     ]
    }
   ],
   "source": [
    "answer(\"谁的父亲是曹操？\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "曹丕的父亲是曹操\n"
     ]
    }
   ],
   "source": [
    "answer(\"曹丕的什么是曹操？\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "曹丕的父亲是曹操\n"
     ]
    }
   ],
   "source": [
    "answer(\"曹丕的父亲是谁？\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "更有挑战性的双查询"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "曹操的子女是曹整\n",
      "曹操的子女是曹华\n",
      "曹操的配偶是李姬\n",
      "曹操的子女是曹植\n",
      "曹操的配偶是刘氏[曹操妾]\n",
      "曹操的子女是曹玹\n",
      "曹操的子女是曹茂\n",
      "曹操的母亲是丁氏\n",
      "曹操的子女是曹矩\n",
      "曹操的配偶是秦夫人\n",
      "曹操的子女是曹昂\n",
      "曹操的子女是曹节[献穆皇后]\n",
      "曹操的配偶是赵姬\n",
      "曹操的配偶是王昭仪\n",
      "曹操的子女是曹彪\n",
      "曹操的子女是曹衮\n",
      "曹操的子女是曹乘\n",
      "曹操的配偶是宋姬[曹操妃]\n",
      "曹操的配偶是孙姬\n",
      "曹操的配偶是武宣皇后\n",
      "曹操的子女是曹京\n",
      "曹操的子女是曹棘\n",
      "曹操的子女是曹丕\n",
      "曹操的子女是曹干\n",
      "曹操的子女是曹据\n",
      "曹操的子女是曹上\n",
      "曹操的配偶是刘姬\n",
      "曹操的官至是丞相\n",
      "曹操的子女是曹均\n",
      "曹操的子女是金乡公主\n",
      "曹操的子女是曹宇\n",
      "曹操的官至是谥曰武\n",
      "曹操的配偶是陈氏[曹操妾]\n",
      "曹操的子女是安阳公主\n",
      "曹操的子女是曹彰\n",
      "曹操的官至是追魏太祖武皇帝\n",
      "曹操的曾效力过是东汉\n",
      "曹操的官至是魏王\n",
      "曹操的子女是曹峻\n",
      "曹操的子女是曹冲\n",
      "曹操的子女是曹徽\n",
      "曹操的子女是曹林\n",
      "曹操的兄弟是曹德\n",
      "曹操的子女是广宗殇公\n",
      "曹操的配偶是周姬\n",
      "曹操的子女是曹铄\n",
      "曹操的配偶是尹夫人\n",
      "曹操的兄弟是曹彬\n",
      "曹操的子女是曹宪\n",
      "曹操的配偶是丁夫人\n",
      "曹操的配偶是刘夫人\n",
      "曹操的父亲是曹嵩\n",
      "曹操的配偶是杜夫人\n",
      "曹操的子女是曹熊\n"
     ]
    }
   ],
   "source": [
    "answer(\"曹操的什么是谁？\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:py36]",
   "language": "python",
   "name": "conda-env-py36-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
